The following code is largely based on the Geodemographics & Data Reduction lab from the Urban Analytics text by Singleton (2017). It builds a geodemographic classification by following, in a single-city context (Liverpool), the methodology used to create the UK Output Area Classification. An interpretation of the final cluster map can be found in bold below.

Data Import and Preparation

We first load a series of tables: “Census_2011_Count_All” is a data frame containing census data for the UK, which are detailed in “Variable_Desc”. The “Liverpool” data frame is a list of output areas within Liverpool, UK, while “OAC_Input_Lookup” is a lookup for the variables used to create OAC.

# Load the prepared census tables for UK Output Areas
load(file = "./data/census_2011_UK_OA.RData")

We now create a subset of the input data for Liverpool by merging the data frames that contain the output areas and the census data:

# Crop the national census counts to Liverpool Output Areas,
# keeping every Liverpool OA even where no census match exists
Census_2011_Count <- merge(
  x = Liverpool,
  y = Census_2011_Count_All,
  by = "OA",
  all.x = TRUE
)

The first stage in the analysis is to create aggregations for the input data. The following code calculates all of the numerators:

# Data frame of Output Area codes onto which each aggregated
# numerator variable is appended
OAC_Input <- as.data.frame(Census_2011_Count$OA)
colnames(OAC_Input) <- "OA"

# Build every aggregated numerator column first, then bind them all in a
# single step: growing the data frame with cbind() inside the loop copies
# it on every iteration (O(n^2)); seq_len() is also safe for empty lookups
numerator_cols <- lapply(seq_len(nrow(OAC_Input_Lookup)), function(n) {

      # Comma-separated list of census variables to aggregate for row n
      select_vars <- OAC_Input_Lookup[n, "England_Wales"]
      select_vars <- unlist(strsplit(paste(select_vars), ","))

      # Sum of the selected census counts for each Output Area;
      # drop = FALSE keeps a data frame even for a single variable
      rowSums(Census_2011_Count[, select_vars, drop = FALSE])
})

# Name each column after its OAC variable code and append in one go
names(numerator_cols) <- paste(OAC_Input_Lookup$VariableCode)
OAC_Input <- cbind(OAC_Input, as.data.frame(numerator_cols))

# Removes temporary objects
remove(list = "numerator_cols")

# The variables for k035 were included merely for coding simplicity above:
# the Standardized Illness Ratio is calculated later, so the summed count
# is removed from the numerator data frame here
OAC_Input$k035 <- NULL

We now create another data frame which contains the denominators…

# Create a list of unique denominator variable codes, dropping blanks
# (paste() also coerces a possible factor to character)
den_list <- unique(OAC_Input_Lookup[,"Denominator"])
den_list <- paste(den_list[den_list != ""])

# Select the OA code plus all denominator columns directly.
# (The original code first built a one-column OA data frame and then
# overwrote it on the next assignment — that dead step is removed.)
OAC_Input_den <- Census_2011_Count[,c("OA",den_list)]

… so as to then merge it with the numerators:

# Attach the denominator columns to the numerators, matched on OA code
OAC_Input <- merge(x = OAC_Input, y = OAC_Input_den, by = "OA")

To match numerators and denominators and calculate percentages we use a loop that runs through a list of variable names created as follows:

# Get numerator/denominator pairs where the Type is "Count" - i.e. not ratio.
# Columns are selected by name rather than position (c(1,3)) for robustness
# against reordering of the lookup table.
K_Var <- OAC_Input_Lookup[OAC_Input_Lookup$Type == "Count",
                          c("VariableCode", "Denominator")]

# Data frame of OA codes onto which the percentages are appended
OAC_Input_PCT_RATIO <- subset(OAC_Input, select = "OA")

# Calculate every percentage column, then bind them all in one step
# (avoids copying the growing data frame on each iteration)
pct_cols <- lapply(seq_len(nrow(K_Var)), function(n) {

  num <- paste(K_Var[n, "VariableCode"]) # Gets numerator name
  den <- paste(K_Var[n, "Denominator"])  # Gets denominator name

  # Percentage of the denominator accounted for by the numerator
  OAC_Input[, num] / OAC_Input[, den] * 100
})

names(pct_cols) <- paste(K_Var$VariableCode)
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO, as.data.frame(pct_cols))

# Remove temporary objects
remove(list = "pct_cols")

The final two variables include density (k007), which can be joined from the original census table:

# Extract the density variable (KS101EW0008) from the census table and
# rename it to its OAC code, k007
density_var <- Census_2011_Count[, c("OA", "KS101EW0008")]
colnames(density_var) <- c("OA", "k007")

# Join density onto the percentage/ratio table by OA code
OAC_Input_PCT_RATIO <- merge(OAC_Input_PCT_RATIO, density_var, by = "OA")

We now calculate the variable k035, which was the standardized illness rate (SIR) and in turn needs to be calculated for each subset of the national data, in this case Liverpool:

# Standardized Illness Ratio (SIR), variable k035: observed ill people
# divided by the number expected if Liverpool-wide illness rates applied
# within two broad age bands (16-64, and "15 or less / 65 or over"),
# multiplied by 100. A value of 100 means illness at the Liverpool average.

# Counts of ill people by age band
ill_16_64 <- rowSums(Census_2011_Count[,c("KS301EW0005","KS301EW0006")]) # Ill people 16-64
ill_total <-   rowSums(Census_2011_Count[,c("KS301EW0002","KS301EW0003")]) # All ill people
ill_L15_G65 <- ill_total - ill_16_64 # Ill people 15 or less and greater than or equal to 65

# Total people by age band (KS102 columns cover the 16-64 age groups)
t_pop_16_64 <- rowSums(Census_2011_Count[,c("KS102EW0007","KS102EW0008","KS102EW0009","KS102EW0010","KS102EW0011","KS102EW0012","KS102EW0013")]) # People 16-64
t_pop <- Census_2011_Count$KS101EW0001 # All people
t_pop_L15_G65 <- t_pop - t_pop_16_64 # All people 15 or less and greater than or equal to 65

# Expected ill people per OA: band population times the Liverpool-wide
# illness rate for that band (indirect age standardization)
ex_ill_16_64 <- t_pop_16_64 * (sum(ill_16_64)/sum(t_pop_16_64)) # Expected ill 16-64
ex_ill_L15_G65 <- t_pop_L15_G65 * (sum(ill_L15_G65)/sum(t_pop_L15_G65)) # Expected ill people 15 or less and greater than or equal to 65

ex_ill <- ex_ill_16_64 + ex_ill_L15_G65 # total expected ill people

# Ratio of observed to expected ill people, scaled to 100
SIR <- as.data.frame(ill_total / ex_ill * 100) # ratio between ill people and expected ill people
colnames(SIR) <- "k035"

# Append k035 to the input table (cbind is safe: both derive from the
# same Census_2011_Count row order)
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO,SIR)

# Removes unwanted objects
remove(list=c("SIR","ill_16_64","ill_total","ill_L15_G65","t_pop_16_64","t_pop","t_pop_L15_G65","ex_ill_16_64","ex_ill_L15_G65","ex_ill"))

We now apply the two standardization and normalization procedures to the input data, namely inverse hyperbolic sine and range standardization.

# Inverse hyperbolic sine transformation of every input variable.
# asinh(x) is the closed form of log(x + sqrt(x^2 + 1)) and is both
# clearer and more numerically stable; column 1 (the OA code) is
# dropped by position rather than hard-coding the 2:61 index range.
OAC_Input_PCT_RATIO_IHS <- asinh(OAC_Input_PCT_RATIO[, -1])

# Range (0-1) standardization.
# NOTE(review): a constant column would give max(x) == min(x) and
# divide by zero — assumed not to occur in this input; verify if reused.
range_01 <- function(x){(x-min(x))/(max(x)-min(x))} # range function
OAC_Input_PCT_RATIO_IHS_01 <- apply(OAC_Input_PCT_RATIO_IHS, 2, range_01) # applies range function to columns

# Adds the OA codes back onto the matrix as row names
rownames(OAC_Input_PCT_RATIO_IHS_01) <- OAC_Input_PCT_RATIO$OA

Estimating the number of clusters

The following is quoted directly from the Lab:

"We have now created a subset of 1584 Output Areas for the extent of Liverpool with inputs that have mirrored the attributes, measures, transformation and standardization methods used for the UK OAC 2011 classification. Prior to clustering this bespoke Liverpool classification, it is worth considering what would be an appropriate number of clusters for the initial Super Group (most aggregate) level.

This can be considered by running some test cluster analysis with different cluster frequency (k), and for each result, examining a statistic called the total within sum of squares. This is a measure of how well the cluster frequency fits the data - essentially the mean standardized distance of the data observations to a cluster mean. These tests are typically plotted, with the purpose to identify an “elbow criterion” which is a visual indication of where an appropriate cluster frequency might be set. The trade off you are looking for is the impact of increasing cluster frequency (i.e. making a more complex classification) versus a reduction in this score."

library(ggplot2)

# Fix the RNG seed: kmeans() uses random starting centers, so without a
# seed the elbow plot (and the chosen k) is not reproducible
set.seed(1234)

# Total within-cluster sum of squares for k = 2 to 12 clusters;
# vapply() preallocates and guarantees one numeric per k
# ($tot.withinss is kmeans' own sum of the per-cluster withinss)
k_values <- 2:12
twss <- vapply(
  k_values,
  function(k) kmeans(OAC_Input_PCT_RATIO_IHS_01, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)

# Data frame pairing each cluster frequency with its score
wss <- data.frame(k = k_values, Twss = twss)

# Plots the results
ggplot(data = wss, aes(x = k, y = Twss)) +
  geom_path() +
  geom_point() +
  scale_x_continuous(breaks = 2:12) +
  labs(y = "Total within sum of squares")

Since there are no large decreases in the within sum of squares and there is a minor moderation of the decrease around 7 or 8 clusters, a 7 cluster solution was chosen.

Building the geodemographic

Now that we have chosen seven clusters, we now consider how the partitioning can be optimized.

# Load the pre-computed 7-cluster k-means object (cluster_7)
load(file = "./data/cluster_7.Rdata")

The cluster results can be accessed as follows:

# Build a lookup table mapping each Output Area to its cluster assignment;
# the cluster vector is named by OA, so the names become the row names
lookup <- data.frame(K_7 = cluster_7$cluster)
lookup$OA <- rownames(lookup)
# Recode cluster numbers 1-7 as Super Group letters A-G
lookup$SUPER <- LETTERS[lookup$K_7]

It is also important to look at the distribution of these clusters:

# Frequency of Output Areas assigned to each of the 7 clusters
table(lookup$K_7)
## 
##   1   2   3   4   5   6   7 
## 259 340 279 334 109  73 190

Here, a map of the clusters is produced. Note the changes from the lab, which include the title, the type of palette, and the border line color

# Load packages
# NOTE(review): rgdal and rgeos were retired from CRAN in 2023 —
# consider migrating to the sf package (sf::st_read) when updating this code
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.5-8, (SVN revision 990)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE 
## Loaded PROJ runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/proj
## Linking to sp version:1.4-2
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(tmap)
library(rgeos)
## rgeos version: 0.5-3, (SVN revision 634)
##  GEOS runtime version: 3.7.2-CAPI-1.11.2 
##  Linking to sp version: 1.4-1 
##  Polygon checking: TRUE
# Import OA boundaries (taking out "OGRGeoJSON")
liverpool_SP <- readOGR("./data/Liverpool_OA_2011.geojson")
## OGR data source with driver: GeoJSON 
## Source: "/Users/Marcos/Downloads/Geographic Information Science III/Lab 6/urban_analytics/10_Data_Reduction_Geodemographics/data/Liverpool_OA_2011.geojson", layer: "Liverpool_OA_2011"
## with 1584 features
## It has 1 fields
# Merge lookup: attach the cluster assignments to the polygons —
# the GeoJSON field is "oa_code", the lookup key is "OA"
liverpool_SP <- merge(liverpool_SP, lookup, by.x="oa_code",by.y="OA")

# Thematic map: polygons coloured by Super Group cluster, drawn in the
# British National Grid projection (EPSG:27700); NA clusters are hidden
m <- tm_shape(liverpool_SP, projection=27700) +
    tm_polygons(col="SUPER", border.col = "grey30",   palette="Pastel1",border.alpha = .3, title="Clusters of the city of Liverpool", showNA=FALSE) +
  tm_layout(legend.position = c("left", "bottom"), frame = FALSE)

# Create an interactive leaflet version of the map
tmap_leaflet(m)
## Warning: The shape liverpool_SP is invalid. See sf::st_is_valid
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## legend.postion is used for plot mode. Use view.legend.position in tm_view to set the legend position in view mode.

To be honest, the interpretation of the map left me somewhat confused because of the patterns that I saw. If I interpreted the data correctly, in this map as well as in the tables that are below, it appears like there are clear patterns in terms of these clusters. The suburbs, where Clusters A and C, but particularly B and D are located, usually show the best results in terms of socioeconomic indicators (which range from English proficiency to the availability of private transport or the ownership of their own house). On the other hand, as we move towards the city center, results start to worsen; Clusters E, G, and particularly F, show much worse outcomes than the others. Notably, all three are contiguous: These are clusters where people tend to be younger, and also where immigrants and minorities appear to concentrate.

Creating cluster descriptions and profiles

We now contextualize the cluster assignments further by looking at the rates for input attributes within each cluster compared to the Liverpool average by creating a table of index scores.

# Merges the cluster lookup with the original counts (inc. denominators)
LiVOAC_Lookup_Input <- merge(lookup,OAC_Input,by="OA",all.x=TRUE)

# Removes the two ratio variables (index scores only apply to counts)
LiVOAC_Lookup_Input$k007 <- NULL
LiVOAC_Lookup_Input$k035 <- NULL

# Aggregates the counts by Super Group. The value columns are everything
# except the three identifiers (OA, K_7, SUPER) — selected by name rather
# than the fragile fixed positions 4:78.
value_cols <- setdiff(colnames(LiVOAC_Lookup_Input), c("OA", "K_7", "SUPER"))
SuperGroup <- aggregate(LiVOAC_Lookup_Input[, value_cols],
                        by = list(LiVOAC_Lookup_Input$SUPER),
                        FUN = sum)

# Data frame for the index scores; deriving the group letters from the
# data (sorted, matching aggregate's group ordering) generalizes beyond
# a hard-coded 7-cluster solution
G_Index <- data.frame(SUPER = sort(unique(LiVOAC_Lookup_Input$SUPER)))

# Index score per variable: the cluster's rate relative to the overall
# Liverpool rate, times 100 (100 = city average); rounded as before.
# Columns are built first and bound once (no cbind-in-loop growth).
index_cols <- lapply(seq_len(nrow(K_Var)), function(n) {
  num <- paste(K_Var[n, "VariableCode"]) # Gets numerator name
  den <- paste(K_Var[n, "Denominator"])  # Gets denominator name
  round((SuperGroup[, num] / SuperGroup[, den]) /
          (sum(SuperGroup[, num]) / sum(SuperGroup[, den])) * 100)
})
names(index_cols) <- paste(K_Var$VariableCode)
G_Index <- cbind(G_Index, as.data.frame(index_cols))

# Removes temporary objects
remove(list = c("index_cols", "value_cols"))

# Views the index scores (100 = the Liverpool average rate for that
# variable; values below/above indicate under-/over-representation)
G_Index
##   SUPER k001 k002 k003 k004 k005 k006 k008 k009 k010 k011 k012 k013 k014 k015
## 1     A   83   91   91  114  147  237   90   92   84  144  106   81   38   40
## 2     B   90  109   84  129  124  121   23   65  164   71  106   60   97   92
## 3     C  125  104  115   98   87   66    8   98  105  102  106   78   71   56
## 4     D  121  129   92  104  108   75   31   95   98  117  107   63   59   23
## 5     E   45   21  184   59   33   41   98  152   42   80   89  171  210  197
## 6     F   35   31   62   32   30   37  933  171   29   33   84  137  280  348
## 7     G  129  113  120   83   73   50   20  112   76  125   77  238  139  195
##   k016 k017 k018 k019 k020 k021 k022 k023 k024 k025 k026 k027 k028 k029 k030
## 1   41   43   59   46  105   60   73   51   73   95    6   51   89   82  155
## 2   48   73   25   32  105   68   29   44  142  130   11  317  221   24   30
## 3   44   47   48   38  104   81   75   55  115  100   58   30   45  185   43
## 4   29   39   49   22  106   41   76   57   79  140    4   66  119  147   11
## 5   79  171  146  275   86  432  186  136  144   14  283   14    9    8  375
## 6  393  415  131  143   86  199  116  148   71   42 1040   31   26  101  199
## 7  305  186  408  408   84  134  291  357   68   70  128   57   59  109  143
##   k031 k032 k033 k034 k036 k037 k038 k039 k040 k041 k042 k043 k044 k045 k046
## 1   70  183   61   92  109  100   62   64   44   57   97   83   83  128   98
## 2  181   11   43   23  123  110   84  143   54  236   80  157   61   53   91
## 3  127   31  128   59   98  113   94  105   64   93  128  115   98   97   90
## 4   87  164   48   69  113  117   68   45   54   65  104   87   89  132  111
## 5   39   67  263  302   53   57  112  227  148   61  105   87  260   79   64
## 6   54   71  231  261   42   47  322   97  480   77   72   42  114   38  178
## 7   48  152  140  159   82   93   84   88  101   39  111   65  117  158  112
##   k047 k048 k049 k050 k051 k052 k053 k054 k055 k056 k057 k058 k059 k060
## 1  101   56  114  112  107  103  126   90   76   93  120   99   84  102
## 2  104   73  115  106  104   87   94   58  119  121   70  125  124   96
## 3  104   98  100  100  104   98  106   80   97  107   92  114  104  104
## 4   95  110  120  121  123  113  126   93   54   82  136   87   68  110
## 5  117   98   48   67   61   78   58  132  209  117   78   82  121   90
## 6   64  295   37   43   25  143   44  258  102   60   76   54  105   75
## 7   95  108   81   90  102  105   89  165   82   80  137   71   86  103

“To assist with spotting trends within the grand index table we can create a plot of shaded cells. Before doing this we will however need to convert the data into a narrow format. The simplest way of doing this is with the melt function that is contained within the reshape2 package.”

# Install reshape2 only if it is missing, rather than unconditionally
# re-installing on every run of the script
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}
library(reshape2)
# Converts from wide to narrow (long) format: one row per (cluster, variable)
G_Index_Melt <- melt(G_Index, id.vars="SUPER")
# Views the top of the new narrow formatted data frame
head(G_Index_Melt)
##   SUPER variable value
## 1     A     k001    83
## 2     B     k001    90
## 3     C     k001   125
## 4     D     k001   121
## 5     E     k001    45
## 6     F     k001    35

“We then need to create a new variable that we will use on the plot to create color bins. It is common practice when building a geodemographic to create aggregations of less than or equal to 80, between 81 and 120 and greater than 120. We implement the recoding here with an ifelse() function; and then re-order the created factors so that they display from lowest to highest on the plot. Finally, we will import and merge a final column which is a set of shortened variable descriptions - however, if you want the full versions (which are too long for the plot) - remember these can be found in the “OAC_Input_Lookup” data frame.”

# Recode the index scores into the standard geodemographic bands
# (<=80, 81-120, >120). cut() replaces the nested ifelse(): with
# right-closed breaks, 80 falls in the first bin and 120 in the second,
# exactly matching the original logic, and it returns an ordered factor.
G_Index_Melt$band <- cut(
  G_Index_Melt$value,
  breaks = c(-Inf, 80, 120, Inf),
  labels = c("< 80", "80-120", ">120")
)

# Add a column with short descriptions of the variables
short <- read.csv("./data/OAC_Input_Lookup_short_labels.csv")
G_Index_Melt <- merge(G_Index_Melt,short,by.x="variable",by.y="VariableCode",all.x=TRUE)

# Order the factors explicitly so the legend and axis make sense in ggplot2
# (the band levels are already in this order from cut(); re-asserted for clarity)
G_Index_Melt$band <- factor(G_Index_Melt$band, levels = c("< 80","80-120",">120"))
G_Index_Melt$VariableDescription <- factor(G_Index_Melt$VariableDescription, levels = short$VariableDescription)

Using ggplot2, we now create a shaded table to come up with descriptions of the clusters (see description above).

library(ggplot2)

# Shaded index-score table: one tile per (cluster, variable) pair,
# coloured by band, with the numeric score printed on top
p <- ggplot(G_Index_Melt,
            aes(x = SUPER, y = VariableDescription, label = value, fill = band)) +
  geom_tile(alpha = 0.8) +
  geom_text(colour = "black") +
  scale_fill_manual(name = "Band", values = c("#EB753B", "#F7D865", "#B3D09F")) +
  scale_x_discrete(position = "top")
p